{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loopy: Counting Operations\n",
    "\n",
    "## Setup code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pyopencl as cl\n",
    "import pyopencl.array\n",
    "import pyopencl.clrandom\n",
    "import loopy as lp\n",
    "\n",
    "from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Choose platform:\n",
      "[0] <pyopencl.Platform 'Portable Computing Language' at 0x7f53c17c66e8>\n",
      "[1] <pyopencl.Platform 'Intel(R) OpenCL' at 0x289ca58>\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      "Choice [0]: 0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Set the environment variable PYOPENCL_CTX='0' to avoid being asked again.\n"
     ]
    }
   ],
   "source": [
    "ctx = cl.create_some_context(interactive=True)\n",
    "queue = cl.CommandQueue(ctx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "n = 1024\n",
    "a = cl.clrandom.rand(queue, (n, n), dtype=np.float32)\n",
    "b = cl.clrandom.rand(queue, (n, n), dtype=np.float32)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Operation-counting matrix multiplication"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here is the simple matrix-matrix multiplication kernel again:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "knl = lp.make_kernel(\n",
    "    \"{[i,j,k]: 0<=i,j,k<n}\",\n",
    "    \"c[i, j] = sum(k, a[i, k]*b[k, j])\",\n",
    "    target=lp.PyOpenCLTarget(queue.device))\n",
    "knl = lp.add_and_infer_dtypes(knl, {\"a\": np.float32, \"b\":np.float32})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Counting flops\n",
    "\n",
    "Let us determine the number of arithmetic operations being carried out:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{Op(np:dtype('float32'), add, subgroup): PwQPolynomial(\"[n] -> { n^3 : n > 0 }\"), Op(np:dtype('float32'), mul, subgroup): PwQPolynomial(\"[n] -> { n^3 : n > 0 }\")}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lp.get_op_map(knl)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The return type is easy to evaluate for a given set of parameters--just use the `.eval_with_dict` method:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3375"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "omap = lp.get_op_map(knl).filter_by(name=[\"add\"], dtype=[np.float32])\n",
    "\n",
    "omap.eval_and_sum({\"n\": 15})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Counting memory access"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/andreas/src/loopy/loopy/statistics.py:1272: UserWarning: get_inames_domain did not get a frozenset\n",
      "  inames_domain = knl.get_inames_domain(insn_inames)\n",
      "/home/andreas/src/loopy/loopy/statistics.py:1327: LoopyWarning: in kernel loopy_kernel: get_insn_count: when counting instruction insn_0 with count_granularity=subgroup, using upper bound for work-group size (1 work-items) to compute sub-groups per work-group. When multiple device programs present, actual sub-group count may belower. (add 'insn_count_subgroups_upper_bound' to silenced_warnings kernel attribute to disable)\n",
      "  warn_with_kernel(knl, \"insn_count_subgroups_upper_bound\",\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup): PwQPolynomial(\"[n] -> { n^3 : n > 0 }\"), MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup): PwQPolynomial(\"[n] -> { n^3 : n > 0 }\"), MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup): PwQPolynomial(\"[n] -> { n^2 : n > 0 }\")}"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lp.get_mem_access_map(knl)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Operation-counting a transformed kernel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine LOOPY_CALL_WITH_INTEGER_TYPES(MACRO_NAME) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    MACRO_NAME(int8, char) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    MACRO_NAME(int16, short) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    MACRO_NAME(int32, int) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    MACRO_NAME(int64, long)\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine LOOPY_DEFINE_FLOOR_DIV_POS_B(SUFFIX, TYPE) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    inline TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    { \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m        if (a<0) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m            a = a - (b-1); \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m        return a\u001b[39;49;00m\u001b[36m/\u001b[39;49;00m\u001b[36mb; \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    }\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV_POS_B)\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mundef LOOPY_DEFINE_FLOOR_DIV_POS_B\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mundef LOOPY_CALL_WITH_INTEGER_TYPES\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m __attribute__ ((reqd_work_group_size(\u001b[34m16\u001b[39;49;00m, \u001b[34m16\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(__global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ a, __global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ b, __global \u001b[36mfloat\u001b[39;49;00m *__restrict__ c, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n)\n",
      "{\n",
      "  \u001b[36mfloat\u001b[39;49;00m acc_k_outer_k_inner;\n",
      "\n",
      "  acc_k_outer_k_inner = \u001b[34m0.0f\u001b[39;49;00m;\n",
      "  \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m k_outer = \u001b[34m0\u001b[39;49;00m; k_outer <= loopy_floor_div_pos_b_int32(-\u001b[34m16\u001b[39;49;00m + n, \u001b[34m16\u001b[39;49;00m); ++k_outer)\n",
      "    \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m k_inner = \u001b[34m0\u001b[39;49;00m; k_inner <= \u001b[34m15\u001b[39;49;00m; ++k_inner)\n",
      "      acc_k_outer_k_inner = acc_k_outer_k_inner + a[n * (\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m16\u001b[39;49;00m * k_outer + k_inner] * b[n * (\u001b[34m16\u001b[39;49;00m * k_outer + k_inner) + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "  c[n * (\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] = acc_k_outer_k_inner;\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "opt_knl = knl\n",
    "opt_knl = lp.assume(opt_knl, \"n mod 16 = 0\")\n",
    "opt_knl = lp.split_iname(opt_knl, \"i\", 16, outer_tag=\"g.0\", inner_tag=\"l.1\")\n",
    "opt_knl = lp.split_iname(opt_knl, \"j\", 16, outer_tag=\"g.1\", inner_tag=\"l.0\")\n",
    "opt_knl = lp.split_iname(opt_knl, \"k\", 16)\n",
    "#opt_knl = lp.add_prefetch(opt_knl, \"a\", \"i_inner,k_inner\")\n",
    "#opt_knl = lp.add_prefetch(opt_knl, \"b\", \"j_inner,k_inner\")\n",
    "\n",
    "opt_knl = lp.set_options(opt_knl, write_cl=True)\n",
    "_ = opt_knl(queue, a=a, b=b)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now count the memory accesses in the transformed version:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{MemAccess(global, np:dtype('float32'), {1: Variable('n')}, {0: Product((Variable('n'), 16))}, load, a, None, subgroup): PwQPolynomial(\"[n] -> { 256 * n * floor((15 + n)/16)^3 : 0 < n <= 16; 4096 * floor((15 + n)/16)^3 : n >= 17 }\"), MemAccess(global, np:dtype('float32'), {0: 1}, {1: 16}, load, b, None, workitem): PwQPolynomial(\"[n] -> { n^3 * floor((15 + n)/16)^3 : 0 < n <= 16; 4096 * floor((15 + n)/16)^3 : n >= 17 }\"), MemAccess(global, np:dtype('float32'), {0: 1, 1: Variable('n')}, {0: Product((Variable('n'), 16)), 1: 16}, store, c, None, workitem): PwQPolynomial(\"[n] -> { n^2 * floor((15 + n)/16)^2 : 0 < n <= 16; 256 * floor((15 + n)/16)^2 : n >= 17 }\")}"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#clear\n",
    "lp.get_mem_access_map(opt_knl)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now enable the prefetch transformation above."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
